library(readxl)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(maps)
library(viridis)
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
## 
##     unemp
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::map()    masks maps::map()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DataExplorer)
library(corrplot)
## corrplot 0.95 loaded
library(ggpubr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(reshape2)
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## 
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
data <- read.csv("G:/STA4996_Research/data/Nutrition.csv")
head(data)
colSums(is.na(data))
##                                      Country 
##                                            0 
##                                         Year 
##                                            0 
##                                    UN_Region 
##                                            0 
##                                UN_sub_region 
##                                            0 
##                              Stunting_survey 
##                                          819 
##                               Stunting_model 
##                                            1 
##                           Underweight_survey 
##                                          823 
##                               Wasting_survey 
##                                          822 
##                            Overweight_survey 
##                                          846 
##                             Overweight_model 
##                                            0 
##                              WHZ_sample_size 
##                                          865 
##                              HAZ_sample_size 
##                                          860 
##                              WAZ_sample_size 
##                                          892 
##                      Under5_population..000. 
##                                          824 
##                                       hcpi_a 
##                                            0 
##                                       fcpi_a 
##                                           98 
##                                        def_a 
##                                           72 
##                              Literacy_female 
##                                          825 
##                               Total_literacy 
##                                          834 
##       Current..health.expenditure....of.GDP. 
##                                           67 
##     Urban.population....of.total.population. 
##                                            0 
##     Rural.population....of.total.population. 
##                                            0 
##                       Mortality_rate_under.5 
##                                            0 
##         Low.birthweight.babies....of.births. 
##                                          365 
## Unemployment..total....of.total.labor.force. 
##                                          194
colnames(data)[1:18] <- c("Country", "Year","UN_Region", "UN_sub_region", "Stunting_survey","Stunting_model","Underweight_survey", "Wasting_survey","Overweight_survey","Overweight_model","WHZ_sample_size","HAZ_sample_size","WAZ_sample_size","U5_population", "hcpi_a","fcpi_a","def_a","literacy_female","literacy_tot","health_expenditure","Urban_pop","rural_pop","inf_mortality","LBW","UnemploymentRate")
## Warning in colnames(data)[1:18] <- c("Country", "Year", "UN_Region",
## "UN_sub_region", : number of items to replace is not a multiple of replacement
## length

#Selected countries

selected_countries <- c("Afghanistan", "Bangladesh", "Bolvia", "Botswana", "Brazil", "Chilie", "Colombia","Congo", "Egypt","Ethiopia", "Iran", "India", "Kenya", "Madagascar","Malaysia", "Maldives", "Mali", "Mauritius", "Mexico","Morocco", "Myanmar", "Nepal", "Nigeria", "Pakistan", "Panama", "Peru","South Africa", "Sri Lanka", "Sudan", "Thailand","Venezuela","Vietnam","Zimbabwe", "Albania", "Azerbaijan", "Tajikistan", "Turkey","Rwanda","Niger","Malawi","Mautitania","Jamaica","Indonesia","Uganda","Senegal","Burkina Faso","Burubdi","Chad","Gambia","Kuweit","Togo","Tanzania","Uruguay")
library(ggplot2)
library(rnaturalearth)
library(rnaturalearthdata)
## 
## Attaching package: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
library(sf)
## Linking to GEOS 3.13.1, GDAL 3.11.0, PROJ 9.6.0; sf_use_s2() is TRUE
world <- ne_countries(scale = "medium", returnclass = "sf")


world$highlight <- ifelse(world$name %in% selected_countries, "Selected", "Other")


ggplot(data = world) +
  geom_sf(aes(fill = highlight)) +
  scale_fill_manual(values = c("Selected" = "tomato", "Other" = "gray90")) +
  theme_minimal() +
  labs(title = "Highlighted Countries on World Map",
       fill = "Country Type")

#Missing values

library(naniar)
vis_miss(data)          

miss_var_summary(data)  
colMeans(is.na(data)) * 100
##                                      Country 
##                                   0.00000000 
##                                         Year 
##                                   0.00000000 
##                                    UN_Region 
##                                   0.00000000 
##                                UN_sub_region 
##                                   0.00000000 
##                              Stunting_survey 
##                                  67.18621821 
##                               Stunting_model 
##                                   0.08203445 
##                           Underweight_survey 
##                                  67.51435603 
##                               Wasting_survey 
##                                  67.43232158 
##                            Overweight_survey 
##                                  69.40114848 
##                             Overweight_model 
##                                   0.00000000 
##                              WHZ_sample_size 
##                                  70.95980312 
##                              HAZ_sample_size 
##                                  70.54963084 
##                              WAZ_sample_size 
##                                  73.17473339 
##                                U5_population 
##                                  67.59639048 
##                                       hcpi_a 
##                                   0.00000000 
##                                       fcpi_a 
##                                   8.03937654 
##                                        def_a 
##                                   5.90648072 
##                              literacy_female 
##                                  67.67842494 
##                               Total_literacy 
##                                  68.41673503 
##       Current..health.expenditure....of.GDP. 
##                                   5.49630845 
##     Urban.population....of.total.population. 
##                                   0.00000000 
##     Rural.population....of.total.population. 
##                                   0.00000000 
##                       Mortality_rate_under.5 
##                                   0.00000000 
##         Low.birthweight.babies....of.births. 
##                                  29.94257588 
## Unemployment..total....of.total.labor.force. 
##                                  15.91468417

By omitting the predictor variables more than 50% of missing values, the selected socioeconomic factors are:

hcpi_a fcpi_a def_a Current health expenditure(% if GDP) urban population(% of total population) mortality rate under 5 Low birth weight babies unemployment rate

Distribution of response variables

Stunting

# Histograms & density
ggplot(data, aes(x = Stunting_model)) +
  geom_histogram(bins = 30, fill = "skyblue", alpha = 0.7) +
  geom_density(aes(y = ..count..), color = "darkblue") +
  theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

The distribution is multi-modal (Multiple peaks) This indicates that different countries or time periods may cluster around different stunting levels.

The highest concentrations of observations appear between 10% and 40% with several mini-peaks

The distribution has a slightly right skew: there are some countries or years with very high stunting percentages, though most values are in the lower-to-mid range.

Long tail extending beyond 50%: high stunting in certain countries or time periods, which might act as outliers in modeling.

overweight

ggplot(data, aes(x = Overweight_model)) +
  geom_histogram(bins = 30, fill = "salmon", alpha = 0.7) +
  geom_density(aes(y = ..count..), color = "darkred") +
  theme_minimal()

Right skewed

There is one clear peak around 0-2% overweight prevalence. (unimodal) not normally distributed the most values appear to be around 0-2%

#Correlation Analysis

df_numeric <- data %>% 
  select(where(is.numeric)) %>% 
  drop_na()


cor_matrix <- cor(df_numeric, use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "lower", tl.cex = 0.5)

According to the correlation plot,

Overweight modeled values are positively correlated with socioeconomic indicators literacy rate, current health expenditure, urban population and negatively correlated with the variables hcpi_a, fcpi_a, def_a, mortality rate under 5, rural population, low birth weight rate, unemployment rate

stunting modeled values are negatively correlated with fcpi_a, hcpi_a,def_a, literacy, current health expenditure, urban population, rural population and positively correlated with mortality rate, low birth weight babies, unemployment rate

#Outlier detction

data %>%
  select(Country, Stunting_model, Overweight_model) %>%
  pivot_longer(cols = c(Stunting_model, Overweight_model), names_to = "indicator") %>%
  ggplot(aes(x = indicator, y = value)) +
  geom_boxplot(fill = "lightblue") +
  theme_minimal()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

Box plots compare the distributions of overweight and stunting rates across countries.

overweight:

median is around 5-6% and the IQR is narrow multiple outliers clustered around 15-25%,indicating several countries with unusually high overweight rates

stunting:

median is around 25-27% and the IQR is much wider Whiskers extend from about 2% to roughly 62% no visible outliers, suggesting the whiskers capture the full range more normally distributed

patterns:

most countries face low overweight but moderate-to-high stunting few countries have severe overweight problems the dual burden exists but is asymmetric across the global landscape

##By country

Bar chart of average by country

data %>%
  group_by(Country) %>%
  summarise(across(c(Stunting_model, Overweight_model), mean, na.rm = TRUE)) %>%
  pivot_longer(cols = -Country, names_to = "indicator", values_to = "mean_val") %>%
  ggplot(aes(x = reorder(Country, mean_val), y = mean_val, fill = indicator)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  theme_minimal()
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(c(Stunting_model, Overweight_model), mean, na.rm =
##   TRUE)`.
## ℹ In group 1: `Country = "Afganistan"`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

High prevalence in stunting than the overweight across the selected countries. Countries at the top (Burundi, Afghanistan… ) have high combined rates of stunting and overweight.

Dual burden pattern can be seen: some countries show significant levels of both indicators.Ex: Egypt, South Africa. This indicates the prevalence of double burden of malnutrition.

Transition pattern: As move down the chart, there is generally a shift from stunting-dominant to more balanced or overweight-dominant patterns, but this is not perfectly linear

This suggests different countries face different malnutrition challenges.

some countries are primarily dealing with undernutrition (stunting), others with a mixed burden.

#Overweight

overweight_data <- data %>%
  select(Country, Year, Overweight_model) %>%
  filter(!is.na(Overweight_model))
#overweight_data
ggplot(data, aes(x = Year, y = Overweight_model, group = Country, color = Country)) +
  geom_line(show.legend = FALSE) +
  theme_minimal() +
  labs(title = "Overweight Trends by Country", y = "Overweight (%)")

countries <- unique(overweight_data$Country)


for(country in countries) {
  country_data <- overweight_data %>% filter(Country == country)
  
  p <- ggplot(country_data, aes(x = Year, y = Overweight_model)) +
    geom_line(color = "steelblue", size = 1) +
    geom_point(color = "steelblue", size = 2) +
    labs(title = paste("Prevalence of Overweight in", country),
         x = "Year",
         y = "Overweight Prevalence (%)") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  
  print(p)
  
  
  ggsave(paste0("overweight_", gsub(" ", "_", country), ".png"), plot = p, width = 8, height = 6)
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

countries from the same region and the same sub-region shows different prevalence in overweight. ex: Uruguay, Bolivia, Brazil

most of the countries have declining patterns

data$Year <- as.numeric(data$Year)
data_filtered <- data %>%
  filter(Country %in% selected_countries & !is.na(Overweight_model)) %>%
  group_by(Country) %>%
  filter(Year == max(Year, na.rm = TRUE)) %>%
  select(Country, Year, Overweight_model)
world_map <- map_data("world")
highlighted_map <- world_map %>%
  filter(region %in% selected_countries) %>%
  left_join(data_filtered, by = c("region" = "Country"))
ggplot() +
  geom_polygon(
    data = world_map, 
    aes(x = long, y = lat, group = group),
    fill = "lightgray", 
    color = "white",
    linewidth = 0.1
  ) +
  geom_polygon(
    data = highlighted_map %>% mutate(Overweight = as.numeric(Overweight_model)),
    aes(x = long, y = lat, group = group, fill = Overweight),
    color = "black",
    linewidth = 0.2
  ) +
  scale_fill_viridis(
    name = "Overweight %", 
    option = "plasma", 
    direction = -1, 
    na.value = "grey90",
    limits = c(0, 100)  
  ) +
  coord_fixed(ratio = 1.3) +
  theme_minimal() +
  labs(
    title = "Overweight Percentage by Country",
    x = "Longitude", 
    y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

#Stunting

ggplot(data, aes(x = Year, y = Stunting_model, group = country, color = country)) +
  geom_line(show.legend = FALSE) +
  theme_minimal() +
  labs(title = "Stunting Trends by Country", y = "Stunting (%)")

stunting_data <- data %>%
  select(Country, Year, Stunting_model) %>%
  filter(!is.na(Stunting_model))

stunting_data$Stunting <- as.numeric(stunting_data$Stunting)
stunting_data$Year <- as.numeric(as.character(stunting_data$Year))


countries <- unique(stunting_data$Country)


y_max <- max(stunting_data$Stunting, na.rm = TRUE)


for (country in countries) {
  country_data <- stunting_data %>% filter(Country == country)
  
  if (nrow(country_data) < 2) next  # Skip if not enough data

  p <- ggplot(country_data, aes(x = Year, y = Stunting_model)) +
    geom_line(color = "#E69F00", size = 1) +
    geom_point(color = "#E69F00", size = 2) +
    labs(title = paste("Prevalence of Stunting in", country),
         x = "Year",
         y = "Stunting Prevalence (%)") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5)) +
    coord_cartesian(ylim = c(0, y_max))

  print(p)

  
  safe_country <- gsub("[^A-Za-z0-9_]", "_", country)
  ggsave(filename = paste0("stunting_", safe_country, ".png"),
         plot = p, width = 8, height = 6, dpi = 300)
}

almost all countries have similar prevelance of stuning over time

data_filtered_st <- data %>%
  filter(Country %in% selected_countries & !is.na(Stunting_model)) %>%
  group_by(Country) %>%
  filter(Year == max(Year, na.rm = TRUE)) %>%
  select(Country, Year, Stunting_model)
world_map1 <- map_data("world")
highlighted_map1 <- world_map1 %>%
  filter(region %in% selected_countries) %>%
  left_join(data_filtered_st, by = c("region" = "Country"))
highlighted_map <- world_map %>%
  left_join(stunting_data, by = c("region" = "Country"))
## Warning in left_join(., stunting_data, by = c(region = "Country")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 765 of `x` matches multiple rows in `y`.
## ℹ Row 759 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
highlighted_map$Stunting <- as.numeric(highlighted_map$Stunting)


ggplot() +
  geom_polygon(
    data = world_map,
    aes(x = long, y = lat, group = group),
    fill = "lightgray", color = "white", linewidth = 0.1
  ) +
  geom_polygon(
    data = highlighted_map,
    aes(x = long, y = lat, group = group, fill = Stunting),
    color = "black", linewidth = 0.2
  ) +
  scale_fill_viridis(
    name = "Stunting (%)",
    option = "plasma", direction = -1, na.value = "grey90",
    limits = c(0, max(highlighted_map$Stunting, na.rm = TRUE))
  ) +
  coord_fixed(ratio = 1.3) +
  theme_minimal() +
  labs(title = "Stunting Percentage by Country", x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )